R Tutorial

An introduction to R


Introduction

This tutorial is will introduce the reader to , a free, open-source statistical computing environment often used with RStudio, a integrated development environment for .

R Project Logo


Calculator

can be used as a super awesome calculator

# 5 + 3 = 8
5 + 3 
## [1] 8
# 24 / (1 + 2) = 8
24 / (1 + 2) 
## [1] 8
# 2 * 2 * 2 = 8
2^3 
## [1] 8
# 8 * 8 = 64
sqrt(64) 
## [1] 8
# -log10(0.05 / 5000000) = 8
-log10(0.05 / 5000000) 
## [1] 8

Functions

has many useful built in functions

1:10
##  [1]  1  2  3  4  5  6  7  8  9 10
as.character(1:10)
##  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10"
rep(1:2, times = 5)
##  [1] 1 2 1 2 1 2 1 2 1 2
rep(1:5, times = 2)
##  [1] 1 2 3 4 5 1 2 3 4 5
rep(1:5, each = 2)
##  [1] 1 1 2 2 3 3 4 4 5 5
rep(1:5, length.out = 7)
## [1] 1 2 3 4 5 1 2
seq(5, 50, by = 5)
##  [1]  5 10 15 20 25 30 35 40 45 50
seq(5, 50, length.out = 5)
## [1]  5.00 16.25 27.50 38.75 50.00
paste(1:10, 20:30, sep = "-")
##  [1] "1-20"  "2-21"  "3-22"  "4-23"  "5-24"  "6-25"  "7-26"  "8-27"  "9-28"  "10-29" "1-30"
paste(1:10, collapse = "-")
## [1] "1-2-3-4-5-6-7-8-9-10"
paste0("x", 1:10)
##  [1] "x1"  "x2"  "x3"  "x4"  "x5"  "x6"  "x7"  "x8"  "x9"  "x10"
min(1:10)
## [1] 1
max(1:10)
## [1] 10
range(1:10)
## [1]  1 10
mean(1:10)
## [1] 5.5
sd(1:10)
## [1] 3.02765

Custom Functions

Users can also create their own functions

customFunction1 <- function(x, y) {
  z <- 100 * x / (x + y)
  paste(z, "%")
}
customFunction1(x = 10, y = 90)
## [1] "10 %"
customFunction2 <- function(x) {
  mymin <- mean(x - sd(x))
  mymax <- mean(x) + sd(x)
  print(paste("Min =", mymin))
  print(paste("Max =", mymax))
}
customFunction2(x = 1:10)
## [1] "Min = 2.47234964590251"
## [1] "Max = 8.52765035409749"

for loops and if else statements

xx <- NULL #creates and empty object
for(i in 1:10) {
  xx[i] <- i*3
}
xx
##  [1]  3  6  9 12 15 18 21 24 27 30
xx %% 2 #gives the remainder when divided by 2
##  [1] 1 0 1 0 1 0 1 0 1 0
for(i in 1:length(xx)) {
  if((xx[i] %% 2) == 0) {
    print(paste(xx[i],"is Even"))
  } else { 
      print(paste(xx[i],"is Odd")) 
    }
}
## [1] "3 is Odd"
## [1] "6 is Even"
## [1] "9 is Odd"
## [1] "12 is Even"
## [1] "15 is Odd"
## [1] "18 is Even"
## [1] "21 is Odd"
## [1] "24 is Even"
## [1] "27 is Odd"
## [1] "30 is Even"
# or
ifelse(xx %% 2 == 0, "Even", "Odd")
##  [1] "Odd"  "Even" "Odd"  "Even" "Odd"  "Even" "Odd"  "Even" "Odd"  "Even"
paste(xx, ifelse(xx %% 2 == 0, "is Even", "is Odd"))
##  [1] "3 is Odd"   "6 is Even"  "9 is Odd"   "12 is Even" "15 is Odd"  "18 is Even" "21 is Odd"  "24 is Even" "27 is Odd"  "30 is Even"

Objects

Information can be stored in user defined objects, in multiple forms:

  • c(): a string of values
  • matrix(): a two dimensional matrix in one format
  • data.frame(): a two dimensional matrix where each column can be a different format
  • list():

A string…

xc <- 1:10
xc
##  [1]  1  2  3  4  5  6  7  8  9 10
xc <- c(1,2,3,4,5,6,7,8,9,10)
xc
##  [1]  1  2  3  4  5  6  7  8  9 10

A matrix…

xm <- matrix(1:100, nrow = 10, ncol = 10, byrow = T)
xm
##       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
##  [1,]    1    2    3    4    5    6    7    8    9    10
##  [2,]   11   12   13   14   15   16   17   18   19    20
##  [3,]   21   22   23   24   25   26   27   28   29    30
##  [4,]   31   32   33   34   35   36   37   38   39    40
##  [5,]   41   42   43   44   45   46   47   48   49    50
##  [6,]   51   52   53   54   55   56   57   58   59    60
##  [7,]   61   62   63   64   65   66   67   68   69    70
##  [8,]   71   72   73   74   75   76   77   78   79    80
##  [9,]   81   82   83   84   85   86   87   88   89    90
## [10,]   91   92   93   94   95   96   97   98   99   100
xm <- matrix(1:100, nrow = 10, ncol = 10, byrow = F)
xm
##       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
##  [1,]    1   11   21   31   41   51   61   71   81    91
##  [2,]    2   12   22   32   42   52   62   72   82    92
##  [3,]    3   13   23   33   43   53   63   73   83    93
##  [4,]    4   14   24   34   44   54   64   74   84    94
##  [5,]    5   15   25   35   45   55   65   75   85    95
##  [6,]    6   16   26   36   46   56   66   76   86    96
##  [7,]    7   17   27   37   47   57   67   77   87    97
##  [8,]    8   18   28   38   48   58   68   78   88    98
##  [9,]    9   19   29   39   49   59   69   79   89    99
## [10,]   10   20   30   40   50   60   70   80   90   100

A data frame…

xd <- data.frame(
  x1 = c("aa","bb","cc","dd","ee",
         "ff","gg","hh","ii","jj"),
  x2 = 1:10,
  x3 = c(1,1,1,1,1,2,2,2,3,3),
  x4 = rep(c(1,2), times = 5),
  x5 = rep(1:5, times = 2),
  x6 = rep(1:5, each = 2),
  x7 = seq(5, 50, by = 5),
  x8 = log10(1:10),
  x9 = (1:10)^3,
  x10 = c(T,T,T,F,F,T,T,F,F,F)
)
xd
##    x1 x2 x3 x4 x5 x6 x7        x8   x9   x10
## 1  aa  1  1  1  1  1  5 0.0000000    1  TRUE
## 2  bb  2  1  2  2  1 10 0.3010300    8  TRUE
## 3  cc  3  1  1  3  2 15 0.4771213   27  TRUE
## 4  dd  4  1  2  4  2 20 0.6020600   64 FALSE
## 5  ee  5  1  1  5  3 25 0.6989700  125 FALSE
## 6  ff  6  2  2  1  3 30 0.7781513  216  TRUE
## 7  gg  7  2  1  2  4 35 0.8450980  343  TRUE
## 8  hh  8  2  2  3  4 40 0.9030900  512 FALSE
## 9  ii  9  3  1  4  5 45 0.9542425  729 FALSE
## 10 jj 10  3  2  5  5 50 1.0000000 1000 FALSE

A list…

xl <- list(xc, xm, xd)
xl[[1]]
##  [1]  1  2  3  4  5  6  7  8  9 10
xl[[2]]
##       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
##  [1,]    1   11   21   31   41   51   61   71   81    91
##  [2,]    2   12   22   32   42   52   62   72   82    92
##  [3,]    3   13   23   33   43   53   63   73   83    93
##  [4,]    4   14   24   34   44   54   64   74   84    94
##  [5,]    5   15   25   35   45   55   65   75   85    95
##  [6,]    6   16   26   36   46   56   66   76   86    96
##  [7,]    7   17   27   37   47   57   67   77   87    97
##  [8,]    8   18   28   38   48   58   68   78   88    98
##  [9,]    9   19   29   39   49   59   69   79   89    99
## [10,]   10   20   30   40   50   60   70   80   90   100
xl[[3]]
##    x1 x2 x3 x4 x5 x6 x7        x8   x9   x10
## 1  aa  1  1  1  1  1  5 0.0000000    1  TRUE
## 2  bb  2  1  2  2  1 10 0.3010300    8  TRUE
## 3  cc  3  1  1  3  2 15 0.4771213   27  TRUE
## 4  dd  4  1  2  4  2 20 0.6020600   64 FALSE
## 5  ee  5  1  1  5  3 25 0.6989700  125 FALSE
## 6  ff  6  2  2  1  3 30 0.7781513  216  TRUE
## 7  gg  7  2  1  2  4 35 0.8450980  343  TRUE
## 8  hh  8  2  2  3  4 40 0.9030900  512 FALSE
## 9  ii  9  3  1  4  5 45 0.9542425  729 FALSE
## 10 jj 10  3  2  5  5 50 1.0000000 1000 FALSE

Selecting Data

xc[5] # 5th element in xc
## [1] 5
xd$x3[5] # 5th element in col "x3"
## [1] 1
xd[5,"x3"] # row 5, col "x3"
## [1] 1
xd$x3 # all of col "x3"
##  [1] 1 1 1 1 1 2 2 2 3 3
xd[,"x3"] # all rows, col "x3"
##  [1] 1 1 1 1 1 2 2 2 3 3
xd[3,] # row 3, all cols
##   x1 x2 x3 x4 x5 x6 x7        x8 x9  x10
## 3 cc  3  1  1  3  2 15 0.4771213 27 TRUE
xd[c(2,4),c("x4","x5")] # rows 2 & 4, cols "x4" & "x5"
##   x4 x5
## 2  2  2
## 4  2  4
xl[[3]]$x1 # 3rd object in the list, col "x1
##  [1] "aa" "bb" "cc" "dd" "ee" "ff" "gg" "hh" "ii" "jj"

regexpr

xx <- data.frame(Name = c("Item 1 (detail 1)",
                          "Item 20 (detail 20)",
                          "Item 300 (detail 300)"),
                 Item = NA,
                 Detail = NA)
xx$Detail <- substr(xx$Name, regexpr("\\(", xx$Name)+1, regexpr("\\)", xx$Name)-1)
xx$Item <- substr(xx$Name, 1, regexpr("\\(", xx$Name)-2)
xx
##                    Name     Item     Detail
## 1     Item 1 (detail 1)   Item 1   detail 1
## 2   Item 20 (detail 20)  Item 20  detail 20
## 3 Item 300 (detail 300) Item 300 detail 300

Data Formats

Data can also be saved in many formats:

  • numeric
  • integer
  • character
  • factor
  • logical
xd$x3 <- as.character(xd$x3)
xd$x3
##  [1] "1" "1" "1" "1" "1" "2" "2" "2" "3" "3"
xd$x3 <- as.numeric(xd$x3)
xd$x3
##  [1] 1 1 1 1 1 2 2 2 3 3
xd$x3 <- as.factor(xd$x3)
xd$x3
##  [1] 1 1 1 1 1 2 2 2 3 3
## Levels: 1 2 3
xd$x3 <- factor(xd$x3, levels = c("3","2","1"))
xd$x3
##  [1] 1 1 1 1 1 2 2 2 3 3
## Levels: 3 2 1
xd$x10
##  [1]  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE
as.numeric(xd$x10) # TRUE = 1, FALSE = 0
##  [1] 1 1 1 0 0 1 1 0 0 0
sum(xd$x10)
## [1] 5

Internal structure of an object can be checked with str()

str(xc) # c()
##  num [1:10] 1 2 3 4 5 6 7 8 9 10
str(xm) # matrix()
##  int [1:10, 1:10] 1 2 3 4 5 6 7 8 9 10 ...
str(xd) # data.frame()
## 'data.frame':    10 obs. of  10 variables:
##  $ x1 : chr  "aa" "bb" "cc" "dd" ...
##  $ x2 : int  1 2 3 4 5 6 7 8 9 10
##  $ x3 : Factor w/ 3 levels "3","2","1": 3 3 3 3 3 2 2 2 1 1
##  $ x4 : num  1 2 1 2 1 2 1 2 1 2
##  $ x5 : int  1 2 3 4 5 1 2 3 4 5
##  $ x6 : int  1 1 2 2 3 3 4 4 5 5
##  $ x7 : num  5 10 15 20 25 30 35 40 45 50
##  $ x8 : num  0 0.301 0.477 0.602 0.699 ...
##  $ x9 : num  1 8 27 64 125 216 343 512 729 1000
##  $ x10: logi  TRUE TRUE TRUE FALSE FALSE TRUE ...
str(xl) # list()
## List of 3
##  $ : num [1:10] 1 2 3 4 5 6 7 8 9 10
##  $ : int [1:10, 1:10] 1 2 3 4 5 6 7 8 9 10 ...
##  $ :'data.frame':    10 obs. of  10 variables:
##   ..$ x1 : chr [1:10] "aa" "bb" "cc" "dd" ...
##   ..$ x2 : int [1:10] 1 2 3 4 5 6 7 8 9 10
##   ..$ x3 : num [1:10] 1 1 1 1 1 2 2 2 3 3
##   ..$ x4 : num [1:10] 1 2 1 2 1 2 1 2 1 2
##   ..$ x5 : int [1:10] 1 2 3 4 5 1 2 3 4 5
##   ..$ x6 : int [1:10] 1 1 2 2 3 3 4 4 5 5
##   ..$ x7 : num [1:10] 5 10 15 20 25 30 35 40 45 50
##   ..$ x8 : num [1:10] 0 0.301 0.477 0.602 0.699 ...
##   ..$ x9 : num [1:10] 1 8 27 64 125 216 343 512 729 1000
##   ..$ x10: logi [1:10] TRUE TRUE TRUE FALSE FALSE TRUE ...

Packages

Additional libraries can be installed and loaded for use.

install.packages("scales")
library(scales)
xx <- data.frame(Values = 1:10)
xx$Rescaled <- rescale(x = xx$Values, to = c(1,30))
xx
##    Values  Rescaled
## 1       1  1.000000
## 2       2  4.222222
## 3       3  7.444444
## 4       4 10.666667
## 5       5 13.888889
## 6       6 17.111111
## 7       7 20.333333
## 8       8 23.555556
## 9       9 26.777778
## 10     10 30.000000

libraries can also be used without having to load them

scales::rescale(1:10, to = c(1,30))
##  [1]  1.000000  4.222222  7.444444 10.666667 13.888889 17.111111 20.333333 23.555556 26.777778 30.000000

Data Wrangling

R for Data Science - https://r4ds.had.co.nz/

xx <- data.frame(Group = c("X","X","Y","Y","Y","X","X","X","Y","Y"),
                 Data1 = 1:10, 
                 Data2 = seq(10, 100, by = 10))
xx$NewData1 <- xx$Data1 + xx$Data2
xx$NewData2 <- xx$Data1 * 1000
xx
##    Group Data1 Data2 NewData1 NewData2
## 1      X     1    10       11     1000
## 2      X     2    20       22     2000
## 3      Y     3    30       33     3000
## 4      Y     4    40       44     4000
## 5      Y     5    50       55     5000
## 6      X     6    60       66     6000
## 7      X     7    70       77     7000
## 8      X     8    80       88     8000
## 9      Y     9    90       99     9000
## 10     Y    10   100      110    10000
xx$Data1 < 5 # which are less than 5
##  [1]  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
xx[xx$Data1 < 5,]
##   Group Data1 Data2 NewData1 NewData2
## 1     X     1    10       11     1000
## 2     X     2    20       22     2000
## 3     Y     3    30       33     3000
## 4     Y     4    40       44     4000
xx[xx$Group == "X", c("Group","Data2","NewData1")]
##   Group Data2 NewData1
## 1     X    10       11
## 2     X    20       22
## 6     X    60       66
## 7     X    70       77
## 8     X    80       88

Data wrangling with tidyverse and pipes (%>%)

library(tidyverse) # install.packages("tidyverse")
xx <- data.frame(Group = c("X","X","Y","Y","Y","Y","Y","X","X","X")) %>%
  mutate(Data1 = 1:10, 
         Data2 = seq(10, 100, by = 10),
         NewData1 = Data1 + Data2,
         NewData2 = Data1 * 1000)
xx
##    Group Data1 Data2 NewData1 NewData2
## 1      X     1    10       11     1000
## 2      X     2    20       22     2000
## 3      Y     3    30       33     3000
## 4      Y     4    40       44     4000
## 5      Y     5    50       55     5000
## 6      Y     6    60       66     6000
## 7      Y     7    70       77     7000
## 8      X     8    80       88     8000
## 9      X     9    90       99     9000
## 10     X    10   100      110    10000
filter(xx, Data1 < 5)
##   Group Data1 Data2 NewData1 NewData2
## 1     X     1    10       11     1000
## 2     X     2    20       22     2000
## 3     Y     3    30       33     3000
## 4     Y     4    40       44     4000
xx %>% filter(Data1 < 5)
##   Group Data1 Data2 NewData1 NewData2
## 1     X     1    10       11     1000
## 2     X     2    20       22     2000
## 3     Y     3    30       33     3000
## 4     Y     4    40       44     4000
xx %>% filter(Group == "X") %>% 
  select(Group, NewColName=Data2, NewData1)
##   Group NewColName NewData1
## 1     X         10       11
## 2     X         20       22
## 3     X         80       88
## 4     X         90       99
## 5     X        100      110
xs <- xx %>% 
  group_by(Group) %>% 
  summarise(Data2_mean = mean(Data2),
            Data2_sd = sd(Data2),
            NewData2_mean = mean(NewData2),
            NewData2_sd = sd(NewData2))
xs
## # A tibble: 2 × 5
##   Group Data2_mean Data2_sd NewData2_mean NewData2_sd
##   <chr>      <dbl>    <dbl>         <dbl>       <dbl>
## 1 X             60     41.8          6000       4183.
## 2 Y             50     15.8          5000       1581.
xx %>% left_join(xs, by = "Group")
##    Group Data1 Data2 NewData1 NewData2 Data2_mean Data2_sd NewData2_mean NewData2_sd
## 1      X     1    10       11     1000         60 41.83300          6000    4183.300
## 2      X     2    20       22     2000         60 41.83300          6000    4183.300
## 3      Y     3    30       33     3000         50 15.81139          5000    1581.139
## 4      Y     4    40       44     4000         50 15.81139          5000    1581.139
## 5      Y     5    50       55     5000         50 15.81139          5000    1581.139
## 6      Y     6    60       66     6000         50 15.81139          5000    1581.139
## 7      Y     7    70       77     7000         50 15.81139          5000    1581.139
## 8      X     8    80       88     8000         60 41.83300          6000    4183.300
## 9      X     9    90       99     9000         60 41.83300          6000    4183.300
## 10     X    10   100      110    10000         60 41.83300          6000    4183.300

Read/Write data

xx <- read.csv("data_r_tutorial.csv")
write.csv(xx, "data_r_tutorial.csv", row.names = F)

For excel sheets, the package readxl can be used to read in sheets of data.

library(readxl) # install.packages("readxl")
xx <- read_xlsx("data_r_tutorial.xlsx", sheet = "Data")

Tidy Data

yy <- xx %>%
  group_by(Name, Location) %>%
  summarise(Mean_DTF = round(mean(DTF),1)) %>% 
  arrange(Location)
yy
## # A tibble: 9 × 3
## # Groups:   Name [3]
##   Name          Location            Mean_DTF
##   <chr>         <chr>                  <dbl>
## 1 CDC Maxim AGL Jessore, Bangladesh     86.7
## 2 ILL 618 AGL   Jessore, Bangladesh     79.3
## 3 Laird AGL     Jessore, Bangladesh     76.8
## 4 CDC Maxim AGL Metaponto, Italy       134. 
## 5 ILL 618 AGL   Metaponto, Italy       138. 
## 6 Laird AGL     Metaponto, Italy       137. 
## 7 CDC Maxim AGL Saskatoon, Canada       52.5
## 8 ILL 618 AGL   Saskatoon, Canada       47  
## 9 Laird AGL     Saskatoon, Canada       56.8
yy <- yy %>% spread(key = Location, value = Mean_DTF)
yy
## # A tibble: 3 × 4
## # Groups:   Name [3]
##   Name          `Jessore, Bangladesh` `Metaponto, Italy` `Saskatoon, Canada`
##   <chr>                         <dbl>              <dbl>               <dbl>
## 1 CDC Maxim AGL                  86.7               134.                52.5
## 2 ILL 618 AGL                    79.3               138.                47  
## 3 Laird AGL                      76.8               137.                56.8
yy <- yy %>% gather(key = TraitName, value = Value, 2:4)
yy
## # A tibble: 9 × 3
## # Groups:   Name [3]
##   Name          TraitName           Value
##   <chr>         <chr>               <dbl>
## 1 CDC Maxim AGL Jessore, Bangladesh  86.7
## 2 ILL 618 AGL   Jessore, Bangladesh  79.3
## 3 Laird AGL     Jessore, Bangladesh  76.8
## 4 CDC Maxim AGL Metaponto, Italy    134. 
## 5 ILL 618 AGL   Metaponto, Italy    138. 
## 6 Laird AGL     Metaponto, Italy    137. 
## 7 CDC Maxim AGL Saskatoon, Canada    52.5
## 8 ILL 618 AGL   Saskatoon, Canada    47  
## 9 Laird AGL     Saskatoon, Canada    56.8
yy <- yy %>% spread(key = Name, value = Value)
yy
## # A tibble: 3 × 4
##   TraitName           `CDC Maxim AGL` `ILL 618 AGL` `Laird AGL`
##   <chr>                         <dbl>         <dbl>       <dbl>
## 1 Jessore, Bangladesh            86.7          79.3        76.8
## 2 Metaponto, Italy              134.          138.        137. 
## 3 Saskatoon, Canada              52.5          47          56.8

Base Plotting

We will start with some basic plotting using the base function plot()

# A basic scatter plot
plot(x = xd$x8, y = xd$x9)

# Adjust color and shape of the points
plot(x = xd$x8, y = xd$x9, col = "darkred", pch = 0)

plot(x = xd$x8, y = xd$x9, col = xd$x4, pch = xd$x4)

# Adjust plot type 
plot(x = xd$x8, y = xd$x9, type = "line")

# Adjust linetype
plot(x = xd$x8, y = xd$x9, type = "line", lty = 2)

# Plot lines and points
plot(x = xd$x8, y = xd$x9, type = "both")

Now lets create some random and normally distributed data to make some more complicated plots

# 100 random uniformly distributed numbers ranging from 0 - 100
ru <- runif(100, min = 0, max = 100)
ru
##   [1] 53.162858 87.147356 11.363108  1.008642 12.146549 12.639364 18.309311 70.684050 26.797678  3.948609 39.007204 35.530865 35.563558  4.293931 48.087959 22.052860
##  [17] 95.216234 25.940807 18.448122 78.060231 73.784667 14.817104 40.382780 87.602320 48.886309 15.037874 17.707382  1.175332 12.613008 45.739346 15.369641 27.276645
##  [33] 78.821396 87.602818 11.501322 84.700034 13.129359 41.180081 23.478617 16.571845 63.251767 23.725627 15.745435 26.417690 44.575378 55.744648 98.273327 15.486392
##  [49] 72.736957 97.116735 13.940902 95.100780 68.912585 98.801836 52.550269 74.529695 96.081035 63.386883 20.853426 15.356134 26.900610 42.022534 94.241965 89.358755
##  [65] 26.591405 89.274616 81.734480 11.885772 33.046302 57.019159 64.546086 88.147427 21.542646 57.130057 36.663192  8.163097 70.729113 45.525576 28.744052 73.718913
##  [81] 62.515881 66.204752 44.034702  2.398333 26.569519 91.850530 11.821732 59.258192 70.126079 72.537270 17.637165 76.702396 34.076774 51.534845 75.646639 64.876584
##  [97]  8.278898 74.361697 13.191981 39.471138
plot(x = ru)

order(ru)
##   [1]   4  28  84  10  14  76  97   3  35  87  68   5  29   6  37  99  51  22  26  60  31  48  43  40  91  27   7  19  59  73  16  39  42  18  44  85  65   9  61  32
##  [41]  79  69  93  12  13  75  11 100  23  38  62  83  45  78  30  15  25  94  55   1  46  70  74  88  81  41  58  71  96  82  53  89   8  77  90  49  80  21  98  56
##  [81]  95  92  20  33  67  36   2  24  34  72  66  64  86  63  52  17  57  50  47  54
ru<- ru[order(ru)]
ru
##   [1]  1.008642  1.175332  2.398333  3.948609  4.293931  8.163097  8.278898 11.363108 11.501322 11.821732 11.885772 12.146549 12.613008 12.639364 13.129359 13.191981
##  [17] 13.940902 14.817104 15.037874 15.356134 15.369641 15.486392 15.745435 16.571845 17.637165 17.707382 18.309311 18.448122 20.853426 21.542646 22.052860 23.478617
##  [33] 23.725627 25.940807 26.417690 26.569519 26.591405 26.797678 26.900610 27.276645 28.744052 33.046302 34.076774 35.530865 35.563558 36.663192 39.007204 39.471138
##  [49] 40.382780 41.180081 42.022534 44.034702 44.575378 45.525576 45.739346 48.087959 48.886309 51.534845 52.550269 53.162858 55.744648 57.019159 57.130057 59.258192
##  [65] 62.515881 63.251767 63.386883 64.546086 64.876584 66.204752 68.912585 70.126079 70.684050 70.729113 72.537270 72.736957 73.718913 73.784667 74.361697 74.529695
##  [81] 75.646639 76.702396 78.060231 78.821396 81.734480 84.700034 87.147356 87.602320 87.602818 88.147427 89.274616 89.358755 91.850530 94.241965 95.100780 95.216234
##  [97] 96.081035 97.116735 98.273327 98.801836
plot(x = ru)

# 100 normally distributed numbers with a mean of 50 and sd of 10
nd <- rnorm(100, mean = 50, sd = 10)
nd
##   [1] 61.04582 65.00405 48.68102 67.01738 65.59204 63.87669 47.22183 55.08998 66.83061 46.09239 47.68251 63.48991 41.20624 49.24514 37.20069 49.67784 66.54418
##  [18] 59.78129 58.98972 45.39165 33.33735 49.54825 25.47123 44.39026 58.80216 41.72097 62.95160 43.88261 58.01719 49.68180 58.34707 61.79853 56.83738 45.67125
##  [35] 51.68285 52.27516 50.50951 41.48843 45.46594 50.66981 57.69684 39.43399 30.40823 44.93101 44.50828 63.51967 59.56001 27.09811 56.26970 49.33395 40.64167
##  [52] 32.55277 46.12448 46.49444 44.83212 37.32876 54.36506 51.73099 52.91481 34.78461 63.21379 65.68372 70.32207 41.95195 66.22669 48.92783 55.66101 59.28977
##  [69] 45.65475 55.65787 47.65649 44.35212 47.06991 39.66171 44.43534 50.20904 50.16913 61.74536 53.12531 58.52444 48.97543 37.83122 65.55682 64.48552 57.37831
##  [86] 47.66658 35.51125 58.97663 39.58757 56.62916 42.63047 57.78908 52.12737 49.21467 58.53158 65.82825 57.60140 50.51799 71.42883 60.07154
nd <- nd[order(nd)]
nd
##   [1] 25.47123 27.09811 30.40823 32.55277 33.33735 34.78461 35.51125 37.20069 37.32876 37.83122 39.43399 39.58757 39.66171 40.64167 41.20624 41.48843 41.72097
##  [18] 41.95195 42.63047 43.88261 44.35212 44.39026 44.43534 44.50828 44.83212 44.93101 45.39165 45.46594 45.65475 45.67125 46.09239 46.12448 46.49444 47.06991
##  [35] 47.22183 47.65649 47.66658 47.68251 48.68102 48.92783 48.97543 49.21467 49.24514 49.33395 49.54825 49.67784 49.68180 50.16913 50.20904 50.50951 50.51799
##  [52] 50.66981 51.68285 51.73099 52.12737 52.27516 52.91481 53.12531 54.36506 55.08998 55.65787 55.66101 56.26970 56.62916 56.83738 57.37831 57.60140 57.69684
##  [69] 57.78908 58.01719 58.34707 58.52444 58.53158 58.80216 58.97663 58.98972 59.28977 59.56001 59.78129 60.07154 61.04582 61.74536 61.79853 62.95160 63.21379
##  [86] 63.48991 63.51967 63.87669 64.48552 65.00405 65.55682 65.59204 65.68372 65.82825 66.22669 66.54418 66.83061 67.01738 70.32207 71.42883
plot(x = nd)

hist(x = nd)

hist(nd, breaks = 20, col = "darkgreen")

plot(x = density(nd))

boxplot(x = nd)

boxplot(x = nd, horizontal = T)


ggplot2

Lets be honest, the base plots are ugly! The ggplot2 package gives the user to create a better, more visually appealing plots. Additional packages such as ggbeeswarm and ggrepel also contain useful functions to add to the functionality of ggplot2.

library(ggplot2)
mp <- ggplot(xd, aes(x = x8, y = x9))
mp + geom_point()

mp + geom_point(aes(color = x3, shape = x3), size = 4)

mp + geom_line(size = 2)

mp + geom_line(aes(color = x3), size = 2)

mp + geom_smooth(method = "loess")

mp + geom_smooth(method = "lm")

xx <- data.frame(data = c(rnorm(50, mean = 40, sd = 10),
                          rnorm(50, mean = 60, sd = 5)),
                 group = factor(rep(1:2, each = 50)),
                 label = c("Label1", rep(NA, 49), "Label2", rep(NA, 49)))
mp <- ggplot(xx, aes(x = data, fill = group))
mp + geom_histogram(color = "black")

mp + geom_histogram(color = "black", position = "dodge")

mp1 <- mp + geom_histogram(color = "black") + facet_grid(group~.)
mp1

mp + geom_density(alpha = 0.5)

mp <- ggplot(xx, aes(x = group, y = data, fill = group))
mp + geom_boxplot(color = "black")

mp + geom_boxplot() + geom_point()

mp + geom_violin() + geom_boxplot(width = 0.1, fill = "white")

library(ggbeeswarm)
mp + geom_quasirandom()

mp + geom_quasirandom(aes(shape = group))

mp2 <- mp + geom_violin() + 
  geom_boxplot(width = 0.1, fill = "white") +
  geom_beeswarm(alpha = 0.5)
library(ggrepel)
mp2 + geom_text_repel(aes(label = label), nudge_x = 0.4)

library(ggpubr)
ggarrange(mp1, mp2, ncol = 2, widths = c(2,1),
          common.legend = T, legend = "bottom")


Statistics

# Prep data
lev_Loc  <- c("Saskatoon, Canada", "Jessore, Bangladesh", "Metaponto, Italy")
lev_Name <- c("ILL 618 AGL", "CDC Maxim AGL", "Laird AGL")
dd <- read_xlsx("data_r_tutorial.xlsx", sheet = "Data") %>%
  mutate(Location = factor(Location, levels = lev_Loc),
         Name = factor(Name, levels = lev_Name))
xx <- dd %>%
  group_by(Name, Location) %>%
  summarise(Mean_DTF = mean(DTF))
xx %>% spread(Location, Mean_DTF)
## # A tibble: 3 × 4
## # Groups:   Name [3]
##   Name          `Saskatoon, Canada` `Jessore, Bangladesh` `Metaponto, Italy`
##   <fct>                       <dbl>                 <dbl>              <dbl>
## 1 ILL 618 AGL                  47                    79.3               138.
## 2 CDC Maxim AGL                52.5                  86.7               134.
## 3 Laird AGL                    56.8                  76.8               137.
# Plot
mp1 <- ggplot(dd, aes(x = Location, y = DTF, color = Name, shape = Name)) +
  geom_point(size = 2, alpha = 0.7, position = position_dodge(width=0.5))
mp2 <- ggplot(xx, aes(x = Location, y = Mean_DTF, 
                      color = Name, group = Name, shape = Name)) +
  geom_point(size = 2.5, alpha = 0.7) + 
  geom_line(size = 1, alpha = 0.7) +
  theme(legend.position = "top")
ggarrange(mp1, mp2, ncol = 2, common.legend = T, legend = "top")

From first glace, it is clear there are differences between genotypes, locations, and genotype x environment (GxE) interactions. Now let’s do a few statistical tests.

summary(aov(DTF ~ Name * Location, data = dd))
##               Df Sum Sq Mean Sq  F value   Pr(>F)    
## Name           2     88      44    3.476   0.0395 *  
## Location       2  65863   32931 2598.336  < 2e-16 ***
## Name:Location  4    560     140   11.044 2.52e-06 ***
## Residuals     45    570      13                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

As expected, an ANOVA shows statistical significance for genotype (p-value = 0.0395), Location (p-value < 2e-16) and GxE interactions (p-value < 2.52e-06). However, all this tells us is that one genotype is different from the rest, one location is different from the others and that there is GxE interactions. If we want to be more specific, would need to do some multiple comparison tests.

If we only have two things to compare, we could do a t-test.

xx <- dd %>% 
  filter(Location %in% c("Saskatoon, Canada", "Jessore, Bangladesh")) %>%
  spread(Location, DTF)
t.test(x = xx$`Saskatoon, Canada`, y = xx$`Jessore, Bangladesh`)
## 
##  Welch Two Sample t-test
## 
## data:  xx$`Saskatoon, Canada` and xx$`Jessore, Bangladesh`
## t = -17.521, df = 32.701, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -32.18265 -25.48402
## sample estimates:
## mean of x mean of y 
##  52.11111  80.94444

DTF in Saskatoon, Canada is significantly different (p-value < 2.2e-16) from DTF in Jessore, Bangladesh.

xx <- dd %>% 
  filter(Name %in% c("ILL 618 AGL", "Laird AGL"),
         Location == "Metaponto, Italy") %>%
  spread(Name, DTF)
t.test(x = xx$`ILL 618 AGL`, y = xx$`Laird AGL`)
## 
##  Welch Two Sample t-test
## 
## data:  xx$`ILL 618 AGL` and xx$`Laird AGL`
## t = 0.38008, df = 8.0564, p-value = 0.7137
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -5.059739  7.059739
## sample estimates:
## mean of x mean of y 
##  137.8333  136.8333

DTF between ILL 618 AGL and Laird AGL are not significantly different (p-value = 0.7137) in Metaponto, Italy.


pch Plot

xx <- data.frame(x = rep(1:6, times = 5, length.out = 26),
                 y = rep(5:1, each = 6, length.out = 26),
                 pch = 0:25)
mp <- ggplot(xx, aes(x = x, y = y, shape = as.factor(pch))) +
  geom_point(color = "darkred", fill = "darkblue", size = 5) +
  geom_text(aes(label = pch), nudge_x = -0.25) +
  scale_shape_manual(values = xx$pch) +
  scale_x_continuous(breaks = 6:1) +
  scale_y_continuous(breaks = 6:1) +
  theme_void() +
  theme(legend.position = "none",
        plot.title = element_text(hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5),
        axis.text = element_blank(),
        axis.ticks = element_blank()) +
  labs(title = "Plot symbols in R (pch)",
       subtitle = "color = \"darkred\", fill = \"darkblue\"",
       x = NULL, y = NULL)
ggsave("pch.png", mp, width = 4.5, height = 3, bg = "white")


R Markdown

Tutorials on how to create an R markdown document like this one can be found here:


© Derek Michael Wright